In [1]:
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
data = pd.read_csv('salaries.csv')
# data = pd.read_csv('C:/Users/Neeraja/Desktop/IUB-DS/SEMESTER 3/DATA VIS/Data_Science_Fields_Salary_Categorization.csv')
In [3]:
data.head()
Out[3]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2022 EN FT Data Analytics Engineer 13000 USD 13000 AR 100 AR S
1 2022 SE FT Data Engineer 100000 USD 100000 US 0 US M
2 2022 SE FT Data Engineer 78000 USD 78000 US 0 US M
3 2022 SE FT Data Engineer 120000 USD 120000 US 0 US M
4 2022 SE FT Data Engineer 95000 USD 95000 US 0 US M
In [4]:
data.shape
Out[4]:
(1637, 11)
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1637 entries, 0 to 1636
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           1637 non-null   int64 
 1   experience_level    1637 non-null   object
 2   employment_type     1637 non-null   object
 3   job_title           1637 non-null   object
 4   salary              1637 non-null   int64 
 5   salary_currency     1637 non-null   object
 6   salary_in_usd       1637 non-null   int64 
 7   employee_residence  1637 non-null   object
 8   remote_ratio        1637 non-null   int64 
 9   company_location    1637 non-null   object
 10  company_size        1637 non-null   object
dtypes: int64(4), object(7)
memory usage: 140.8+ KB
In [6]:
data.describe().transpose()
Out[6]:
count mean std min 25% 50% 75% max
work_year 1637.0 2021.770922 0.518070 2020.0 2022.0 2022.0 2022.0 2022.0
salary 1637.0 223294.370800 985438.837723 5000.0 85000.0 130000.0 175100.0 30400000.0
salary_in_usd 1637.0 126509.493586 63103.689059 5000.0 80165.0 128000.0 168000.0 450000.0
remote_ratio 1637.0 58.827123 46.909032 0.0 0.0 100.0 100.0 100.0
In [7]:
 data.dtypes
Out[7]:
work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object
In [8]:
# data['salary_in_usd']=data['salary_in_usd'].apply(lambda x: int((x.split(".")[0]).replace(",","")))
data.dtypes
Out[8]:
work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object
In [9]:
# data.drop(columns=data.columns[0], axis=1, inplace=True)
In [10]:
data.head()
Out[10]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2022 EN FT Data Analytics Engineer 13000 USD 13000 AR 100 AR S
1 2022 SE FT Data Engineer 100000 USD 100000 US 0 US M
2 2022 SE FT Data Engineer 78000 USD 78000 US 0 US M
3 2022 SE FT Data Engineer 120000 USD 120000 US 0 US M
4 2022 SE FT Data Engineer 95000 USD 95000 US 0 US M
In [11]:
list(data.job_title.unique())
Out[11]:
['Data Analytics Engineer',
 'Data Engineer',
 'Data Specialist',
 'Data Analytics Consultant',
 'Data Scientist',
 'Data Analyst',
 'Machine Learning Engineer',
 'Machine Learning Software Engineer',
 '3D Computer Vision Researcher',
 'ML Engineer',
 'Machine Learning Researcher',
 'Data Architect',
 'Research Engineer',
 'Machine Learning Research Engineer',
 'Analytics Engineer',
 'Data Analytics Manager',
 'Data Science Consultant',
 'Deep Learning Engineer',
 'BI Data Analyst',
 'Data Science Manager',
 'Applied Scientist',
 'Data Management Specialist',
 'Research Scientist',
 'Autonomous Vehicle Technician',
 'Data Science Tech Lead',
 'BI Analyst',
 'Machine Learning Developer',
 'Machine Learning Scientist',
 'Data Scientist Lead',
 'Data Manager',
 'Cloud Data Engineer',
 'Head of Data',
 'Data Operations Analyst',
 'Data Operations Engineer',
 'Marketing Data Analyst',
 'Data Science Lead',
 'Power BI Developer',
 'Product Data Scientist',
 'Big Data Engineer',
 'Principal Data Architect',
 'Machine Learning Manager',
 'Lead Data Scientist',
 'Lead Machine Learning Engineer',
 'NLP Engineer',
 'ETL Developer',
 'AI Scientist',
 'Business Data Analyst',
 'Applied Machine Learning Scientist',
 'Data Engineering Manager',
 'Director of Data Science',
 'Financial Data Analyst',
 'Computer Vision Software Engineer',
 'Product Data Analyst',
 'Machine Learning Infrastructure Engineer',
 'Applied Data Scientist',
 'Cloud Data Architect',
 'Lead Data Engineer',
 'Head of Machine Learning',
 'Data Science Engineer',
 'Head of Data Science',
 'Computer Vision Engineer',
 'Principal Data Analyst',
 'Data Analytics Lead',
 'Principal Data Scientist',
 'Principal Data Engineer',
 'Lead Data Analyst',
 'Director of Data Engineering',
 'Big Data Architect',
 'Staff Data Scientist',
 'Finance Data Analyst']
In [12]:
data_Jobtitles = " ".join(list(data['job_title'].apply(lambda a: "".join(a.split(" ")))))
from wordcloud import WordCloud
from PIL import Image
wordcloud = WordCloud(width=2000, height=1800, margin=0, colormap='Blues').generate(data_Jobtitles)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.margins(x=0, y=0)
plt.show()
In [13]:
data.hist( layout = (2,2), bins = 20,figsize = (10,10),)
Out[13]:
array([[<AxesSubplot:title={'center':'work_year'}>,
        <AxesSubplot:title={'center':'salary'}>],
       [<AxesSubplot:title={'center':'salary_in_usd'}>,
        <AxesSubplot:title={'center':'remote_ratio'}>]], dtype=object)
In [14]:
data.isnull().sum() # checking for null values 
Out[14]:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

Data Analysis

In [15]:
import seaborn as sns
# import altair as alt
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import plotly.express as px
In [16]:
residence = data['employee_residence'].value_counts()
top10_employee_location = residence[:10]
fig = px.bar(y=top10_employee_location.values, 
             x=top10_employee_location.index, 
             color = top10_employee_location.index,
             
             text_auto=True,
             title= 'Top 10 Location of Employee',
             )
fig.update_layout(
    xaxis_title="Location of Employee",
    yaxis_title="count",
    font = dict(size=17,family="Franklin Gothic"))
fig.show()
In [17]:
data = data[data['employee_residence'].str.contains("US")]
data.head()
Out[17]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
1 2022 SE FT Data Engineer 100000 USD 100000 US 0 US M
2 2022 SE FT Data Engineer 78000 USD 78000 US 0 US M
3 2022 SE FT Data Engineer 120000 USD 120000 US 0 US M
4 2022 SE FT Data Engineer 95000 USD 95000 US 0 US M
5 2022 SE FT Data Specialist 110000 USD 110000 US 0 US M
In [18]:
plt.figure(figsize = (15,15))
plt.subplot(2,3,1)
sns.stripplot(x='employment_type', y='salary_in_usd', data=data)
plt.subplot(2,3,2)
sns.stripplot(x='company_size', y='salary_in_usd', data=data)
plt.subplot(2,3,3)
sns.stripplot(x='experience_level', y='salary_in_usd', data=data)
plt.subplot(2,3,4)
sns.boxplot(x="employment_type",y="salary_in_usd",data=data)
plt.subplot(2,3,5)
sns.boxplot(x="company_size",y="salary_in_usd",data=data)
plt.subplot(2,3,6)
sns.boxplot(x="experience_level",y="salary_in_usd",data=data)
Out[18]:
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
In [19]:
plt.figure(figsize = (15,10))
#plt.figure(figsize = (35,25))
plt.subplot(2,3,1)
sns.swarmplot(x='employment_type', y='salary_in_usd', data=data)
plt.subplot(2,3,2)
sns.swarmplot(x='company_size', y='salary_in_usd', data=data)
plt.subplot(2,3,3)
sns.swarmplot(x='experience_level', y='salary_in_usd', data=data)
/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

64.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

71.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

12.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

76.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

36.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

6.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

Out[19]:
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
In [20]:
plt.figure(figsize = (15,10))
#plt.figure(figsize = (35,25))
plt.subplot(2,3,1)
sns.violinplot(x='employment_type', y='salary_in_usd', data=data)
plt.subplot(2,3,2)
sns.violinplot(x='company_size', y='salary_in_usd', data=data)
plt.subplot(2,3,3)
sns.violinplot(x='experience_level', y='salary_in_usd', data=data)
Out[20]:
<AxesSubplot:xlabel='experience_level', ylabel='salary_in_usd'>
In [21]:
plt.figure(figsize = (25,13))
plt.subplot(2,3,1)
sns.swarmplot(x='employment_type', y='salary_in_usd', data=data,hue="company_size")

plt.subplot(2,3,2)
sns.violinplot(x='employment_type', y='salary_in_usd', data=data,hue="company_size")
/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning:

39.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

Out[21]:
<AxesSubplot:xlabel='employment_type', ylabel='salary_in_usd'>
In [22]:
# !pip install plotly
In [23]:
import plotly.express as px
In [24]:
# 
fig=go.Figure(px.scatter(data, x=data["employment_type"], y=data["salary_in_usd"],color=data["employment_type"]))

fig.update_layout(
    updatemenus=[
        dict(buttons=list([
                        dict(
                args=["type", "Scatter"],
                label="Scatter Plot",
                method="restyle"
            ),
            dict(
                args=["type", "violin"],
                label="Violin Plot",
                method="restyle"
            ),
            dict(
                args=["type", "box"],
                label="box Chart",
                method="restyle"
            )
        ]),
            direction="down",
        ),
    ]
)
 
fig.show()
In [25]:
 

fig=go.Figure(px.scatter(data, x=data["company_size"], y=data["salary_in_usd"],color=data["company_size"]))

fig.update_layout(
    updatemenus=[
        dict(buttons=list([
                        dict(
                args=["type", "Scatter"],
                label="Scatter Plot",
                method="restyle"
            ),
            dict(
                args=["type", "violin"],
                label="Violin Plot",
                method="restyle"
            ),
            dict(
                args=["type", "box"],
                label="box Chart",
                method="restyle"
            )
        ]),
            direction="down",
        ),
    ]
)
 
fig.show()
In [26]:
 

fig=go.Figure(px.scatter(data, x=data["experience_level"], y=data["salary_in_usd"],color=data["experience_level"]))

fig.update_layout(
    updatemenus=[
        dict(buttons=list([
                        dict(
                args=["type", "Scatter"],
                label="Scatter Plot",
                method="restyle"
            ),
            dict(
                args=["type", "violin"],
                label="Violin Plot",
                method="restyle"
            ),
            dict(
                args=["type", "box"],
                label="box Chart",
                method="restyle"
            )
        ]),
            direction="down",
        ),
    ]
)
 
fig.show()
In [ ]:
 
In [27]:
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.bar(x = ['Fully Remote','Partially Remote','No Remote Work'], 
       y = data['remote_ratio'].value_counts().values,
       color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Remote Working Ratio Distribution',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Remote Type",
    yaxis_title="count",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [28]:
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.histogram(x = data["work_year"],nbins=5 ,color=data["remote_ratio"],
      
       #color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
      title = 'Year Vs Remote Working ratio',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Remote Type",
    yaxis_title="count",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [29]:
# Entery level/ Junior jobs that are remote and paying morethan median salary in Large Companies
data_en_remote_pay_G_Median_L = data[(data['experience_level'] == 'EN') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_en_remote_pay_G_Median_L
Out[29]:
job_title company_location
939 Data Analyst US
1007 Machine Learning Developer US
1441 Machine Learning Scientist US
In [ ]:
 
In [30]:
sal_list = list(data[(data['experience_level'] == 'EN') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['salary_in_usd']].salary_in_usd)
sal_list
Out[30]:
[150000, 180000, 225000]
In [31]:
# Interactive Vizualizations
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.bar(x = data_en_remote_pay_G_Median_L['job_title'], 
       y = sal_list,
       color = data_en_remote_pay_G_Median_L['job_title'].value_counts().values,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Entery level/ Junior jobs that are remote and paying morethan median in Large Companies',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Job Title",
    yaxis_title="Salary",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [32]:
# Senior-level / Expert jobs that are remort and paying morethan median in Large Companies
data_SE_remote_pay_G_Median_L = data[(data['experience_level'] == 'SE') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_SE_remote_pay_G_Median_L.head()
Out[32]:
job_title company_location
559 Lead Data Scientist US
581 Data Engineer US
602 Data Scientist Lead US
605 Data Scientist US
606 Data Scientist US
In [33]:
# Interactive Vizualizations
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.bar(x = data_SE_remote_pay_G_Median_L['job_title'].unique(), 
       y = data_SE_remote_pay_G_Median_L['job_title'].value_counts().values,
#        color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Senior-level / Expert jobs that are remote and paying morethan median in Large Companies',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Job Title",
    yaxis_title="count",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [34]:
# Mid-level / Intermediate that are remort and paying morethan median in Large Companies
data_MI_remote_pay_G_Median_L = data[(data['experience_level'] == 'MI') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_MI_remote_pay_G_Median_L.head()
Out[34]:
job_title company_location
516 ML Engineer US
664 Business Data Analyst US
1008 Data Scientist US
1108 Machine Learning Scientist US
1309 Applied Data Scientist US
In [35]:
# Interactive Vizualizations
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.bar(x = data_MI_remote_pay_G_Median_L['job_title'].unique(), 
       y = data_MI_remote_pay_G_Median_L['job_title'].value_counts().values,
#        color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Mid-level / Intermediate jobs that are remote and paying morethan median in Large Companies',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Job Title",
    yaxis_title="count",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [36]:
# Executive-level / Director jobs that are remort and paying morethan median in Large Companies
data_EX_remote_pay_G_Median_L = data[(data['experience_level'] == 'EX') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_EX_remote_pay_G_Median_L

sal_list1 = list(data[(data['experience_level'] == 'EX') & (data['remote_ratio']==100) & (data['company_size']=='L') &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['salary_in_usd']].salary_in_usd)
sal_list1
Out[36]:
[235000, 325000]
In [37]:
# Interactive Vizualizations
remote_type = ['Fully Remote','Partially Remote','No Remote Work']

fig = go.Figure()
fig=px.bar(x = data_EX_remote_pay_G_Median_L['job_title'].unique(), 
       y = sal_list1,
#        color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Executive-level / Director jobs that are remote and paying morethan median in Large Companies',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="Job Title",
    yaxis_title="Salary",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [38]:
# Entery level/ Junior jobs that are `remote` and paying morethan median 
data_en_remote_pay_G_Median = data[(data['experience_level'] == 'EN') & (data['remote_ratio']==100) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_en_remote_pay_G_Median

# Senior-level / Expert jobs that are remote and paying morethan median 
data_SE_remote_pay_G_Median = data[(data['experience_level'] == 'SE') & (data['remote_ratio']==100) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_SE_remote_pay_G_Median.head()

# Mid-level / Intermediate that are remote and paying morethan median 
data_MI_remote_pay_G_Median = data[(data['experience_level'] == 'MI') & (data['remote_ratio']==100) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_MI_remote_pay_G_Median.head()

# Executive-level / Director jobs that are remote and paying morethan median
data_EX_remote_pay_G_Median = data[(data['experience_level'] == 'EX') & (data['remote_ratio']==100) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_EX_remote_pay_G_Median.head()
Out[38]:
job_title company_location
158 Data Engineer US
159 Data Engineer US
401 Analytics Engineer US
402 Analytics Engineer US
590 Data Engineer US
In [39]:
# Entry level/ Junior jobs that are `partially remote` and paying morethan median 
data_en_ParRemote_pay_G_Median = data[(data['experience_level'] == 'EN') & (data['remote_ratio']==50) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_en_ParRemote_pay_G_Median

# Senior-level / Expert jobs that are remote and paying morethan median 
data_SE_ParRemote_pay_G_Median = data[(data['experience_level'] == 'SE') & (data['remote_ratio']==50) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_SE_ParRemote_pay_G_Median.head()

# Mid-level / Intermediate that are remote and paying morethan median 
data_MI_ParRemote_pay_G_Median = data[(data['experience_level'] == 'MI') & (data['remote_ratio']==50) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_MI_ParRemote_pay_G_Median.head()

# Executive-level / Director jobs that are remote and paying morethan median
data_EX_ParRemote_pay_G_Median = data[(data['experience_level'] == 'EX') & (data['remote_ratio']==50) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_EX_ParRemote_pay_G_Median
Out[39]:
job_title company_location

Jobs that are not remote and paying morethan median¶

In [40]:
# Entery level/ Junior jobs that are `not remote` and paying morethan median 
data_en_NotRemote_pay_G_Median = data[(data['experience_level'] == 'EN') & (data['remote_ratio']==0) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_en_NotRemote_pay_G_Median

# Senior-level / Expert jobs that are not remote and paying morethan median 
data_SE_NotRemote_pay_G_Median = data[(data['experience_level'] == 'SE') & (data['remote_ratio']==0) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_SE_NotRemote_pay_G_Median.head()

# Mid-level / Intermediate that are not remote and paying morethan median 
data_MI_NotRemote_pay_G_Median = data[(data['experience_level'] == 'MI') & (data['remote_ratio']==0) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_MI_NotRemote_pay_G_Median.head()

# Executive-level / Director jobs that are not remote and paying morethan median
data_EX_NotRemote_pay_G_Median = data[(data['experience_level'] == 'EX') & (data['remote_ratio']==0) &(data['salary_in_usd'] >= np.median(data['salary_in_usd']))][['job_title', 'company_location']]
data_EX_NotRemote_pay_G_Median.head()
Out[40]:
job_title company_location
325 Head of Data US
326 Head of Data US
706 Data Science Manager US
707 Data Science Manager US
947 Data Science Manager US
In [ ]:
 
In [41]:
# Different types of Employment roles
# FullTime vs Contract jobs
data_fulltimevsParttime = data[['employment_type', 'salary_in_usd']].groupby('employment_type').count().rename(columns={'salary_in_usd': 'No_of_jobs'}).sort_values('No_of_jobs', ascending=False)
data_fulltimevsParttime.reset_index(inplace=True)
data_fulltimevsParttime.head()
Out[41]:
employment_type No_of_jobs
0 FT 1167
1 CT 5
In [42]:
# Jobs that pay morethan median
data_JobsMedian = data[['job_title', 'salary']].groupby('job_title').count().rename(columns={'salary': 'No_of_positions'}).sort_values('No_of_positions', ascending=False)
# df_dec_pos.reset_index(inplace=True)
data_JobsMedian.head()
Out[42]:
No_of_positions
job_title
Data Engineer 333
Data Scientist 298
Data Analyst 198
Machine Learning Engineer 65
Data Architect 43
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [43]:
correlation = data.corr().round(2)
In [44]:
fig = px.imshow(correlation, text_auto=True)
fig.show()
In [45]:
data1=data.groupby(['job_title']).size() .sort_values(ascending=False) .reset_index(name='count')  
data2=data1.head(10)
In [46]:
x=data2["job_title"]
y=data2["count"]

fig = go.Figure()
fig=px.bar(x = x, 
       y =y,
       color = x,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
       title = 'Top 10 Job Titles',
       #template='plotly_dark'
            )
    
fig.update_layout(


    xaxis_title="job_title",
    yaxis_title="Count",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig.show()
In [ ]:
 
In [47]:
data1=data.groupby(['salary_in_usd','job_title']).size().reset_index( )
data2=(data1[-15:])
In [48]:
fig1 = go.Figure()
fig1=px.bar(x=data2['job_title'],y=data2['salary_in_usd'],color=data2['salary_in_usd'],
      
       #color = remote_type,
       #color_discrete_sequence=px.colors.sequential.dense,
       text_auto=True,
      # title = 'Remote Working Ratio Distribution',
       #template='plotly_dark'
            )
    
fig1.update_layout(


    xaxis_title="job_title",
    yaxis_title="Salary",
    font = dict(size=17,family="Franklin Gothic")
)    
# showing the plot
fig1.show()
In [ ]:
 
In [ ]: